1 Data Processing

This RMarkdown contains the code used for analysis for the subsection Data Reuse in Metabolomics of Chapter 3 of the thesis Fit for purpose? A metascientific analysis of metabolomics data in public repositories.

1.1 Functions

This section contains functions that are used for processing the reuse data.

# Extract the identifiers of the individual studies that have been reused
# x: data frame with a `Study(-ies)` column holding comma-separated study IDs
# Returns a character vector with one element per reuse of a study, so an ID
# is repeated when the study has been reused more than once
ReusedStudies <- function(x) {
    ids <- unlist(strsplit(as.character(x$`Study(-ies)`), ","))
    # Entries that do not name one specific study are dropped: anything
    # containing "-" (presumably ID ranges), "All" or "GNPS"
    not_single <- grepl("-", ids) | grepl("All", ids) | grepl("GNPS", ids)
    ids[!not_single]
}

# Count how often each study has been reused, most frequently reused first
# x: data frame accepted by ReusedStudies()
# Returns a data frame with the columns Studies and Freq
UniqueStudiesCount <- function(x) {
    reuse_counts <- as.data.frame(table(Studies = ReusedStudies(x)))
    # Negating Freq sorts in decreasing order while ties keep table order
    reuse_counts[order(-reuse_counts$Freq), ]
}

# Tally the reuse of MetaboLights studies (IDs containing "MTBLS")
# x: vector of reused study IDs, one element per reuse
# Returns a data frame with the columns Study, Frequency and Repository,
# ordered by decreasing frequency
MetaboLightsReuse <- function(x) {
    is_ml <- grepl("MTBLS", x)
    tally <- as.data.frame(table(x[is_ml]))
    tally <- tally[order(-tally$Freq), ]
    tally[["Repository"]] <- rep("MetaboLights", nrow(tally))
    names(tally) <- c("Study", "Frequency", "Repository")
    tally
}

# Tally the reuse of Metabolomics Workbench studies (IDs containing "ST000")
# x: vector of reused study IDs, one element per reuse
# Returns a data frame with the columns Study, Frequency and Repository,
# ordered by decreasing frequency
MWReuse <- function(x) {
    is_mw <- grepl("ST000", x)
    tally <- as.data.frame(table(x[is_mw]))
    tally <- tally[order(-tally$Freq), ]
    tally[["Repository"]] <- rep("Metabolomics Workbench", nrow(tally))
    names(tally) <- c("Study", "Frequency", "Repository")
    tally
}

# Tally the reuse of GNPS studies (IDs containing "MSV")
# x: vector of reused study IDs, one element per reuse
# Returns a data frame with the columns Study, Frequency and Repository,
# ordered by decreasing frequency
GNPSReuse <- function(x) {
    is_gnps <- grepl("MSV", x)
    tally <- as.data.frame(table(x[is_gnps]))
    tally <- tally[order(-tally$Freq), ]
    tally[["Repository"]] <- rep("GNPS", nrow(tally))
    names(tally) <- c("Study", "Frequency", "Repository")
    tally
}

# Combine the amount of reuse per repository into a single data frame
# Studylist: vector of reused study IDs, one element per reuse
# GNPS, ML, MW: number of GNPS, MetaboLights and Metabolomics Workbench
#   studies which have not been reused (callers derive these as repository
#   size minus the number of reused studies); 0 is accepted
# Returns a data frame with the columns Study, Frequency and Repository.
# Repository is a factor whose level text has spaces replaced by newlines
CombStudies <- function(Studylist, GNPS, ML, MW) {
    # One placeholder row with Frequency 0 per study that was never reused.
    # Every column is built with length n so that n = 0 gives an empty data
    # frame instead of a "differing number of rows" error
    NotReused <- function(n, repository) {
        data.frame(Study = rep("Study", n), Frequency = rep(0, n),
            Repository = rep(repository, n))
    }
    # Reused studies, tallied per repository
    AllStudies <- rbind(GNPSReuse(Studylist), MetaboLightsReuse(Studylist),
        MWReuse(Studylist))
    # Combine into single dataframe
    AllStudies <- rbind(AllStudies, NotReused(ML, "MetaboLights"),
        NotReused(MW, "Metabolomics Workbench"), NotReused(GNPS, "GNPS"))
    AllStudies$Repository <- as.factor(AllStudies$Repository)
    # Break multi-word repository names over two lines
    levels(AllStudies$Repository) <- gsub(" ", "\n", levels(AllStudies$Repository))
    return(AllStudies)
}

# Percentage of studies found at each frequency of reuse from 0 to 4
# x: data frame with a Frequency column, one row per study
# Returns a data frame with the columns FrequencyofReuse and Percentage
PerReuse <- function(x) {
    reuse_levels <- c(0, 1, 2, 3, 4)
    n_studies <- nrow(x)
    # Share of rows whose Frequency equals each level, as a percentage
    share <- vapply(reuse_levels, function(k) {
        nrow(x[x$Frequency == k, ])/n_studies * 100
    }, numeric(1))
    data.frame(FrequencyofReuse = reuse_levels, Percentage = share)
}

# Percentage of studies at each frequency of reuse, per repository and across
# all repositories
# x: data frame with the columns Frequency and Repository, as built by
#   CombStudies()
# Returns a data frame with the columns Frequency, GNPS, MetaboLights,
# Metabolomics Workbench and All Studies
PerReuseRepository <- function(x) {
    # Labels as stored in x$Repository; CombStudies() replaces the space in
    # "Metabolomics Workbench" with a newline
    repo_labels <- c("GNPS", "MetaboLights", "Metabolomics\nWorkbench")
    per_repo <- lapply(repo_labels, function(repo) {
        PerReuse(x[x$Repository == repo, ])
    })
    overall <- PerReuse(x)
    # Keep the frequency column once, then one percentage column per group
    df <- cbind(per_repo[[1]], per_repo[[2]][, 2], per_repo[[3]][, 2], overall[, 2])
    colnames(df) <- c("Frequency", "GNPS", "MetaboLights", "Metabolomics Workbench", 
        "All Studies")
    df
}

# Percentage of studies published in each year that have been reused
# Repos: data frame of all studies in a repository, with the columns StudyID
#   and Year
# ReStud: data frame of reused studies, with a Studies column of study IDs
# Returns a data frame with the columns Year (factor) and Percentage, one row
# per year in which at least one study of the repository has been reused
AgeReuse <- function(Repos, ReStud) {
    is_reused <- Repos$StudyID %in% ReStud$Studies
    # Publication year of studies that have / have not been reused
    reused_by_year <- table(Repos$Year[is_reused])
    not_by_year <- table(Repos$Year[!is_reused])
    years <- names(reused_by_year)
    reused <- as.vector(reused_by_year)
    # Align the not-reused counts with the reused counts by year, not by row
    # position: the two tables need not cover the same set of years
    not_reused <- as.vector(not_by_year)[match(years, names(not_by_year))]
    # A year in which every study was reused has no not-reused entry
    not_reused[is.na(not_reused)] <- 0
    # Calculate percentage of studies reused per year
    data.frame(Year = factor(years, levels = years),
        Percentage = round(reused/(reused + not_reused) * 100, 2))
}

1.2 Reuse per Year

Calculate the number of publications that reuse data from each repository per year.

# Load Reuse data
DataReuse <- read.csv("../data/DataReuseMetabolomics.csv", check.names = FALSE)
DataReuse$Year <- as.factor(DataReuse$Year)

# Add 2012 as the leading level so that it is tabulated with a frequency of 0.
# union() keeps the levels unique: factor() rejects duplicated levels, so
# c("2012", levels(...)) fails whenever the level is already present
DataReuse$Year <- factor(DataReuse$Year, levels = union("2012", levels(DataReuse$Year)))
# Find the frequency of data reuse per year
DataReuseYear <- as.data.frame(table(DataReuse$Year))
colnames(DataReuseYear) <- c("Year", "Frequency")
DataReuseYear$Repository <- "All"

# Publications that reuse data from each repository
# MetaboLights
MLNo <- DataReuse[grepl("MetaboLights", DataReuse$Repository), ]
# Metabolomics Workbench
MWNo <- DataReuse[grepl("Metabolomics Workbench", DataReuse$Repository), ]
# GNPS
GNPSNo <- DataReuse[grepl("GNPS", DataReuse$Repository), ]

# Create contingency tables of the frequency of reuse per year, making sure
# that the year prior to any example of reuse is a level for plotting. The
# subsets keep every level of DataReuse$Year, so the extra level may already
# exist; union() avoids duplicating it
# MetaboLights
MLYear <- as.data.frame(table(MLNo$Year))
colnames(MLYear) <- c("Year", "Frequency")
MLYear$Repository <- "MetaboLights"
MLYear$Year <- factor(MLYear$Year, levels = union("2012", levels(MLYear$Year)))
# Metabolomics Workbench
MWYear <- as.data.frame(table(MWNo$Year))
colnames(MWYear) <- c("Year", "Frequency")
MWYear$Repository <- "Metabolomics Workbench"
MWYear$Year <- factor(MWYear$Year, levels = union("2016", levels(MWYear$Year)))
# GNPS
GNPSYear <- as.data.frame(table(GNPSNo$Year))
colnames(GNPSYear) <- c("Year", "Frequency")
GNPSYear$Repository <- "GNPS"
GNPSYear$Year <- factor(GNPSYear$Year, levels = union("2015", levels(GNPSYear$Year)))

# Recombine data into single data frame
Freqperyear <- rbind(DataReuseYear, MLYear, MWYear, GNPSYear)
Freqperyear$Repository <- as.factor(Freqperyear$Repository)

# Remove 2018 for plotting
Y2018 <- grepl("2018", Freqperyear$Year)
No2018 <- Freqperyear[!Y2018, ]

1.3 Reuse per Study

Calculate the frequency of reuse of each study.

# Every reuse of an individual study, including the reuse in Spicer et al.
TotalUniqueStudies <- ReusedStudies(DataReuse)

# Numbers of studies never reused, as of 15/2/18 (repository size minus the
# number of reused studies): GNPS 691 - nrow(GNPSCount) = 606, MetaboLights
# 332 - nrow(MetaboLightsCount) = 187, Metabolomics Workbench 614 -
# nrow(MWCount) = 369
# NOTE(review): the original note cites both 329 and 332 MetaboLights
# studies - confirm which total is correct
AllStudies <- CombStudies(Studylist = TotalUniqueStudies, GNPS = 606, ML = 187, 
    MW = 369)

# Percentage of studies at each frequency of reuse, per repository and overall
Reuseper <- PerReuseRepository(AllStudies)
# Long format for plotting: one row per frequency and repository
Reuseperg <- melt(Reuseper, id.vars = "Frequency")
colnames(Reuseperg) <- c("Frequency", "Repository", "Percentage")
# Frequency levels run from 4 down to 0
Reuseperg$Frequency <- factor(Reuseperg$Frequency, levels = as.character(4:0))
levels(Reuseperg$Repository) <- gsub(" ", "\n", levels(Reuseperg$Repository))

# Every reuse of an individual study, leaving out publications whose authors
# include Spicer
TotalUniqueStudiesnoSpicer <- ReusedStudies(DataReuse[!grepl("Spicer", DataReuse$`Author(-s)`), ])

# Numbers of studies never reused when Spicer et al. is left out, as of
# 15/2/18: GNPS 691 - nrow(GNPSCountnoSpicer) = 606, MetaboLights 329 -
# nrow(MetaboLightsCountnoSpicer) = 275, Metabolomics Workbench 614 -
# nrow(MWCountnoSpicer) = 592
AllStudiesnospicer <- CombStudies(Studylist = TotalUniqueStudiesnoSpicer, GNPS = 606, 
    ML = 275, MW = 592)

# Percentage of studies at each frequency of reuse, per repository and overall
Reusepernospicer <- PerReuseRepository(AllStudiesnospicer)
# Long format for plotting: one row per frequency and repository
Reusepernospicerg <- melt(Reusepernospicer, id.vars = "Frequency")
colnames(Reusepernospicerg) <- c("Frequency", "Repository", "Percentage")
Reusepernospicerg$Frequency <- factor(Reusepernospicerg$Frequency, levels = as.character(4:0))
levels(Reusepernospicerg$Repository) <- gsub(" ", "\n", levels(Reusepernospicerg$Repository))

1.4 Same Authors

Studies were examined to see whether they reused data produced by the same set of authors, submitters or study owners. This is important because one of the most common reasons researchers cite for not sharing data is the fear that they will be able to generate fewer publications from their data, and that other researchers will scoop them.

Percentage of studies that reuse data that share at least one author/submitter/study owner with the original study:

# Percentage of reuse publications flagged in the `Same Group` column, i.e.
# where the original authors have reused their own data. The column is
# summed, so it is presumably a 0/1 or logical flag
SameAuth <- with(DataReuse, round(sum(`Same Group`)/length(`Same Group`) * 100, digits = 2))
SameAuth
## [1] 47.06

1.5 Time until Reuse

Calculate the time between data being made public and its reuse. The average number of years is:

# Load repository data and derive the year in which each study was made public
# MetaboLights (dates parsed as day/month/two-digit year)
ML <- read.csv("../data/MLStudiesTime.csv", check.names = FALSE, stringsAsFactors = FALSE)
ML$Year <- format(as.Date(ML$StudyPublicationDate, "%d/%m/%y"), "%Y")
# Metabolomics Workbench (day/month/four-digit year)
MW <- read.csv("../data/MWStudiesTime.csv", check.names = FALSE, stringsAsFactors = FALSE)
MW$Year <- format(as.Date(MW$ReleaseDate, "%d/%m/%Y"), "%Y")
# GNPS (abbreviated month name, e.g. "Jan. 31,2017"; %b depends on the locale)
GNPS <- read.csv("../data/GNPSStudiesTime.csv", check.names = FALSE, stringsAsFactors = FALSE)
GNPS$Year <- format(as.Date(GNPS$`Upload Date`, "%b. %d,%Y"), "%Y")

# Split DataReuse by study, but maintain details of reuse: one row per
# (publication, reused study) pair
UniqueStudiesReuse <- DataReuse %>%
    mutate(`Study(-ies)` = strsplit(as.character(`Study(-ies)`), ",")) %>%
    unnest(`Study(-ies)`)
# Rename the columns by name rather than by position: where unnest() places
# the unnested column depends on the tidyr version, so fixed column indices
# can hit the wrong column
# `Study(-ies)` becomes StudyID for joining
colnames(UniqueStudiesReuse)[colnames(UniqueStudiesReuse) == "Study(-ies)"] <- "StudyID"
# Year becomes ReuseYear
colnames(UniqueStudiesReuse)[colnames(UniqueStudiesReuse) == "Year"] <- "ReuseYear"

# Create data frames of just StudyID and Year
# NOTE(review): columns are picked by position; presumably the study ID and
# the Year column added above - confirm against the CSV layouts
MLSY <- as.data.frame(cbind(ML[, 1], ML[, 7]))
MWSY <- as.data.frame(cbind(MW[, 1], MW[, 10]))
GNPSSY <- as.data.frame(cbind(GNPS[, 2], GNPS[, 10]))
# join data frames
StIDY <- rbind(MLSY, MWSY, GNPSSY)
colnames(StIDY) <- c("StudyID", "Year")

# add year to studies
UniqueStudiesReuseYear <- left_join(UniqueStudiesReuse, StIDY, by = "StudyID")
# Convert years to numeric (via character, in case they are factors)
UniqueStudiesReuseYear$Year <- as.numeric(as.character(UniqueStudiesReuseYear$Year))
UniqueStudiesReuseYear$ReuseYear <- as.numeric(as.character(UniqueStudiesReuseYear$ReuseYear))
# Calculate time between data being published and reuse
UniqueStudiesReuseYear$Time2Reuse <- UniqueStudiesReuseYear$ReuseYear - UniqueStudiesReuseYear$Year

# Create contingency table
Time2Reuse <- as.data.frame(table(UniqueStudiesReuseYear$Time2Reuse))
colnames(Time2Reuse) <- c("Years", "Frequency")
Time2Reuse$Percentage <- Time2Reuse$Frequency/sum(Time2Reuse$Frequency) * 100

# Average time until reuse; studies without a matched release year are NA and
# are left out
AveTime <- round(mean(UniqueStudiesReuseYear$Time2Reuse, na.rm = TRUE), 2)
AveTime
## [1] 1.8

1.6 Time Data made Public

Calculate the percentage of studies released in each year that have been reused, per repository.

# Unique studies that have been reused, with their frequency of reuse
AllStudiesReused <- UniqueStudiesCount(DataReuse)

# Percentage of MetaboLights studies released each year that were reused
MLAge <- AgeReuse(ML, AllStudiesReused)

# Percentage of Metabolomics Workbench studies released each year that were
# reused
MWAge <- AgeReuse(MW, AllStudiesReused)

# Percentage of GNPS studies released each year that were reused
GNPSAge <- AgeReuse(GNPS, AllStudiesReused)

# Label each table with its repository
# MetaboLights
MLAge$Repository <- "MetaboLights"
MLAge$Year <- as.factor(MLAge$Year)
# Metabolomics Workbench
MWAge$Repository <- "Metabolomics Workbench"
MWAge$Year <- as.factor(MWAge$Year)
# GNPS
GNPSAge$Repository <- "GNPS"
GNPSAge$Year <- as.factor(GNPSAge$Year)

# Recombine data into single data frame
PerAge <- rbind(MLAge, MWAge, GNPSAge)
PerAge$Repository <- as.factor(PerAge$Repository)
# Add row showing 0 reuse for GNPS in 2017. The row is a data frame, not a
# bare character vector, so Percentage stays numeric and a year that is not
# yet a factor level is added instead of becoming NA
GNPS2017 <- data.frame(Year = "2017", Percentage = 0, Repository = "GNPS")
PerAge <- rbind(PerAge, GNPS2017)
PerAge$Percentage <- as.numeric(PerAge$Percentage)

# Reuse excluding publications whose authors include Spicer
NoSpicerReused <- UniqueStudiesCount(DataReuse[!grepl("Spicer", DataReuse$`Author(-s)`), 
    ])

# Percentage of MetaboLights studies released each year that were reused
MLNSAge <- AgeReuse(ML, NoSpicerReused)

# Percentage of Metabolomics Workbench studies released each year that were
# reused
MWNSAge <- AgeReuse(MW, NoSpicerReused)

# Percentage of GNPS studies released each year that were reused
GNPSNSAge <- AgeReuse(GNPS, NoSpicerReused)

# Label each table with its repository
# MetaboLights
MLNSAge$Repository <- "MetaboLights"
MLNSAge$Year <- as.factor(MLNSAge$Year)
# Metabolomics Workbench
MWNSAge$Repository <- "Metabolomics Workbench"
MWNSAge$Year <- as.factor(MWNSAge$Year)
# GNPS
GNPSNSAge$Repository <- "GNPS"
GNPSNSAge$Year <- as.factor(GNPSNSAge$Year)

# Recombine data into single data frame
PerNSAge <- rbind(MLNSAge, MWNSAge, GNPSNSAge)
PerNSAge$Repository <- as.factor(PerNSAge$Repository)
# Add rows showing 0 reuse for Metabolomics Workbench and GNPS in 2017
MW2017 <- data.frame(Year = "2017", Percentage = 0, Repository = "Metabolomics Workbench")
PerNSAge <- rbind(PerNSAge, MW2017, GNPS2017)
PerNSAge$Percentage <- as.numeric(PerNSAge$Percentage)

Calculate the number of studies released per repository per year.

# Mean release year of MetaboLights studies
MLSDT <- round(mean(as.numeric(ML$Year), na.rm = TRUE), 2)

# Mean release year of Metabolomics Workbench studies
MWSDT <- round(mean(as.numeric(MW$Year), na.rm = TRUE), 2)

# Mean release year of GNPS studies
GNPSDT <- round(mean(as.numeric(GNPS$Year), na.rm = TRUE), 2)

# Mean release year of all studies. The years are concatenated with c():
# cbind() would recycle the shorter vectors up to the length of the longest
# one and so count studies of the smaller repositories more than once
Years <- c(ML$Year, MW$Year, GNPS$Year)
YearsDT <- round(mean(as.numeric(Years), na.rm = TRUE), 2)

# Number of studies released per year
# MetaboLights
MLSYear <- as.data.frame(table(ML$Year))
colnames(MLSYear) <- c("Year", "Frequency")
MLSYear$Repository <- "MetaboLights"
# Metabolomics Workbench
MWSYear <- as.data.frame(table(MW$Year))
colnames(MWSYear) <- c("Year", "Frequency")
MWSYear$Repository <- "Metabolomics Workbench"
# GNPS
GNPSSYear <- as.data.frame(table(GNPS$Year))
colnames(GNPSSYear) <- c("Year", "Frequency")
GNPSSYear$Repository <- "GNPS"
# All studies
AllSYears <- as.data.frame(table(Years))
colnames(AllSYears) <- c("Year", "Frequency")
AllSYears$Repository <- "All"

# Recombine data into single data frame
StudYear <- rbind(AllSYears, MLSYear, MWSYear, GNPSSYear)
StudYear$Repository <- as.factor(StudYear$Repository)

# Remove 2018, 2019 and 2050 for plotting
y2018 <- grepl("2018", StudYear$Year)
Till2017 <- StudYear[!y2018, ]
y2019 <- grepl("2019", Till2017$Year)
Till2017 <- Till2017[!y2019, ]
y2050 <- grepl("2050", Till2017$Year)
Till2017 <- Till2017[!y2050, ]

# Add zero years for plotting
MW0 <- data.frame(Year = 2012, Frequency = 0, Repository = "Metabolomics Workbench")
GNPS0 <- data.frame(Year = 2013, Frequency = 0, Repository = "GNPS")

# Put 2011 first among the year levels (union() avoids a duplicated level if
# it already exists), then append the zero rows
Till2017$Year <- factor(Till2017$Year, levels = union("2011", levels(Till2017$Year)))
Till2017 <- rbind(Till2017, MW0, GNPS0)

2 Tables

2.1 Table 3.7. Studies that reuse public available metabolomics data

# Table 3.7: first four columns of DataReuse plus the link to each paper
Paper <- DataReuse$Link
cbind(DataReuse[, 1:4], Paper) %>%
    kable(caption = "Studies that reuse public available metabolomics data, as of 15^th^ February 2018. The table shows articles that reuse publicly available data, the repository(-ies) that were the source of the data, the year the article was published and classification of how the data were re-used.") %>%
    kable_styling(full_width = FALSE, bootstrap_options = c("hover", "responsive"))
Studies that reuse public available metabolomics data, as of 15th February 2018. The table shows articles that reuse publicly available data, the repository(-ies) that were the source of the data, the year the article was published and classification of how the data were re-used.
Title Repository Year Classification Paper
Predicting Network Activity from High Throughput Metabolomics MetaboLights 2013 Methods https://doi.org/10.1371/journal.pcbi.1003123
The Risa R/Bioconductor package: integrative data analysis from experimental metadata and back again. MetaboLights 2014 Software https://doi.org/10.1186%2F1471-2105-15-S1-S11
PredRet: Prediction of Retention Time by Direct Mapping between Multiple Chromatographic Systems MetaboLights 2015 Software https://doi.org/10.1021/acs.analchem.5b02287
The influence of scaling metabolomics data on model classification accuracy MetaboLights 2015 Methods https://doi.org/10.1007/s11306-014-0738-7
Joint Analysis of Dependent Features within Compound Spectra Can Improve Detection of Differential Features MetaboLights 2015 Methods https://doi.org/10.3389/fbioe.2015.00129
BiNChE: A web tool and library for chemical enrichment analysis based on the ChEBI ontology MetaboLights 2015 Resource https://doi.org/10.1186/s12859-015-0486-3
Approaches to sample size determination for multivariate data: Applications to PCA and PLS-DA of omics data MetaboLights 2016 Methods https://doi.org/10.1021/acs.jproteome.5b01029
Galaxy-M: a Galaxy workflow for processing and analyzing direct infusion and liquid chromatography mass spectrometry-based metabolomics data MetaboLights 2016 Software https://doi.org/10.1186/s13742-016-0115-8
Performance Evaluation and Online Realization of Data-driven Normalization Methods Used in LC/MS based Untargeted Metabolomics Analysis MetaboLights 2016 Software https://doi.org/10.1038/srep38881
Effect of Insulin Resistance on Monounsaturated Fatty Acid Levels: A Multi-cohort Non-targeted Metabolomics and Mendelian Randomization Study MetaboLights 2016 Biological studies https://doi.org/10.1371/journal.pgen.1006379
Non-targeted metabolomics combined with genetic analyses identifies bile acid synthesis and phospholipid metabolism as being associated with incident type 2 diabetes MetaboLights 2016 Biological studies https://doi.org/10.1007/s00125-016-4041-1
Partial least squares with structured output for modelling the metabolomics data obtained from complex experimental designs: A study into the Y-block coding MetaboLights 2016 Methods https://doi.org/10.3390/metabo6040038
Dereplication of peptidic natural products through database search of mass spectra GNPS 2016 Methods https://doi.org/10.1038/nchembio.2219
DES-ncRNA: A knowledgebase for exploring information about human micro and long noncoding RNAs based on literature-mining MetaboLights 2017 Resource https://doi.org/10.1080/15476286.2017.1312243
DES-TOMATO: A Knowledge Exploration System Focused On Tomato Species MetaboLights 2017 Resource https://doi.org/10.1038/s41598-017-05448-0
MsPurity: Automated Evaluation of Precursor Ion Purity for Mass Spectrometry-Based Fragmentation in Metabolomics MetaboLights 2017 Software https://doi.org/10.1021/acs.analchem.6b04358
NOREVA: normalization and evaluation of MS-based metabolomics data MetaboLights 2017 Software https://doi.org/10.1093/nar/gkx449
Untargeted metabolomics suffers from incomplete data analysis MetaboLights, Metabolomics Workbench 2017 Methods https://doi.org/10.1007/s11306-017-1246-3
Joint Bounding of Peaks Across Samples Improves Differential Analysis in Mass Spectrometry-Based Metabolomics MetaboLights 2017 Software https://doi.org/10.1021/acs.analchem.6b04719
LiverWiki: a wiki-based database for human liver MetaboLights, Metabolomics Workbench 2017 Resource https://doi.org/10.1186/s12859-017-1852-0
mzML2ISA & nmrML2ISA: generating enriched ISA-Tab metadata files from metabolomics XML data MetaboLights 2017 Software https://doi.org/10.1093/bioinformatics/btx169
Mass Spectral Feature List Optimizer (MS-FLO): A Tool To Minimize False Positive Peak Reports in Untargeted Liquid Chromatography-Mass Spectroscopy (LC-MS) Data Processing Metabolomics Workbench 2017 Software https://doi.org/10.1021/acs.analchem.6b04372
xMSannotator: an R package for network-based annotation of high-resolution metabolomics data Metabolomics Workbench 2017 Software https://doi.org/10.1021/acs.analchem.6b01214
Distribution based nearest neighbor imputation for truncated high dimensional data with applications to pre-clinical and clinical metabolomics studies Metabolomics Workbench 2017 Methods https://doi.org/10.1186/s12859-017-1547-6
Significance estimation for large scale untargeted metabolomics annotations GNPS 2017 Methods https://doi.org/10.1038/s41467-017-01318-5
Proposal for a common nomenclature for fragment ions in mass spectra of lipids. MetaboLights 2017 Metadata https://doi.org/10.1371/journal.pone.0188394
Metadata analyser: Measuring metadata quality MetaboLights 2017 Metadata https://doi.org/10.1007/978-3-319-60816-7_24
Assessing Public Metabolomics Metadata, Towards Improving Quality. MetaboLights 2017 Metadata https://doi.org/10.1515/jib-2017-0054
Molecular structures enumeration and virtual screening in the chemical space with RetroPath2.0. MetaboLights 2017 Resource https://doi.org/10.1186/s13321-017-0252-9
Chemical Similarity Enrichment Analysis (ChemRICH) as alternative to biochemical pathway mapping for metabolomic datasets Metabolomics Workbench 2017 Resource https://doi.org/10.1038/s41598-017-15231-w
Meta-mass shift chemical profiling of metabolomes from coral reefs. GNPS 2017 Methods https://doi.org/10.1073/pnas.1710248114
Compliance with minimum information guidelines in public metabolomics repositories MetaboLights, Metabolomics Workbench, MetaPhen, MeRy-B 2017 Metadata https://doi.org/10.1038/sdata.2017.137
Evaluation and comparison of bioinformatic tools for the enrichment analysis of metabolomics data MetaboLights, Metabolomics Workbench 2018 Methods https://doi.org/10.1186/s12859-017-2006-0
Increased diversity of peptidic natural products revealed by modification-tolerant database search of mass spectra GNPS 2018 Methods https://doi.org/10.1038/s41564-017-0094-2

3 Figures

Code that was used to generate raw figures. Figures were further processed in Adobe Illustrator.

3.1 Figure 3.10. The reuse of metabolomics data over time.

Data reuse across all repositories is shown in red, GNPS reuse is shown in blue, MetaboLights reuse is shown in green and Metabolomics Workbench is shown in purple. The launch year of each repository is also highlighted.

# Figure 3.10: number of publications reusing data per year, one line per
# repository plus "All" (No2018 is Freqperyear without the 2018 rows)
ggplot(No2018, aes(Year, Frequency, color=Repository, group=Repository))  + 
  geom_line() +
  geom_point() +
  # Start the y axis exactly at 0, without padding
  scale_y_continuous(expand = c(0, 0), limits = c(0, 20)) +
  # Dashed vertical markers at the first three positions of the discrete Year
  # axis; the annotate() calls below label them as the repository launches
  geom_segment(aes(x = 1, y = 0, xend = 1, yend = 18), linetype="dashed", color = "black", size=0.2) +
  geom_segment(aes(x = 2, y = 0, xend = 2, yend = 17), linetype="dashed", color = "black", size=0.2) +
  geom_segment(aes(x = 3, y = 0, xend = 3, yend = 18), linetype="dashed", color = "black", size=0.3) +
  theme_bw() +
  theme(
    legend.position="top",
    axis.text = element_text(colour = "black"),
    axis.line.x = element_line(color="black", size = 0.5),
    axis.line.y = element_line(color="black", size = 0.5),
    # Remove gridlines and borders
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    legend.title.align=0.5) +
  # Colours follow the order of the Repository levels: red, blue, green, purple
  scale_color_manual(values = c("#e41a1c","#377eb8", "#4daf4a", "#984ea3")) +
  guides(color = guide_legend(title.position = "top")) +
  # Launch labels placed just above the dashed markers
  annotate("text", x = 1, y = 19, label = "MetaboLights \n launched") +
  annotate("text", x = 2, y = 18.5, label = "Metabolomics \n Workbench \n launched") +
  annotate("text", x = 3, y = 19, label = "GNPS \n launched")

3.2 Figure 3.11A. The percentage of studies reused at each frequency including all studies

The percentage of studies reused at each frequency: 0, 1, 2, 3, or 4 times, as of 15th February 2018, including reuse in all studies.

# Figure 3.11A: stacked bars of the percentage of studies at each frequency
# of reuse, one bar per repository plus "All Studies"
ggplot(Reuseperg, aes(x=Repository, y=Percentage, fill=Frequency))  +
  # stat = "identity" plots the precomputed percentages as bar heights
  geom_bar(colour="black", stat="identity", width = 0.7) +
  # Upper limit of 101, presumably so that a full 100% stack is not dropped
  scale_y_continuous(expand = c(0, 0), limits = c(0, 101)) +
  # One fill colour per Frequency level
  scale_fill_manual(values = c('#0868ac','#43a2ca','#7bccc4',"#bae4bc",'#f0f9e8')) +
  ylab("Frequency of Reuse per Study (%)")+
  theme_bw() +
  # Black axis lines and text; gridlines, border and background removed
  theme(axis.text = element_text(colour = "black"),
        legend.text.align = 0,
        #legend.text = element_text(face = "italic"),
        axis.line.x = element_line(color="black", size = 0.5),
        axis.line.y = element_line(color="black", size = 0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()) 

3.3 Figure 3.11B. The percentage of studies reused at each frequency excluding Spicer et al. (2017)

The percentage of studies reused at each frequency: 0, 1, 2, 3, or 4 times, as of 15th February 2018, excluding reuse by Spicer et al. (2017).

# Figure 3.11B: as Figure 3.11A, but built from the reuse data that leaves
# out publications whose authors include Spicer
ggplot(Reusepernospicerg, aes(x=Repository, y=Percentage, fill=Frequency))  +
  # stat = "identity" plots the precomputed percentages as bar heights
  geom_bar(colour="black", stat="identity", width = 0.7) +
  # Upper limit of 101, presumably so that a full 100% stack is not dropped
  scale_y_continuous(expand = c(0, 0), limits = c(0, 101)) +
  # One fill colour per Frequency level
  scale_fill_manual(values = c('#0868ac','#43a2ca','#7bccc4',"#bae4bc",'#f0f9e8')) +
  ylab("Frequency of Reuse per Study (%)")+
  theme_bw() +
  # Black axis lines and text; gridlines, border and background removed
  theme(axis.text = element_text(colour = "black"),
        legend.text.align = 0,
        #legend.text = element_text(face = "italic"),
        axis.line.x = element_line(color="black", size = 0.5),
        axis.line.y = element_line(color="black", size = 0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank()) 

3.4 Figure 3.12A. The frequency of metabolomics studies released per year

The total number of studies released per year is shown in red, GNPS studies are in blue, MetaboLights are in green and Metabolomics Workbench are in purple.

# Figure 3.12A: number of studies released per year, one line per repository
# plus "All" (Till2017 has the 2018, 2019 and 2050 rows removed)
ggplot(Till2017, aes(Year, Frequency, color=Repository, group=Repository))  + 
  geom_line() +
  geom_point() +
  # Start the y axis exactly at 0, without padding
  scale_y_continuous(expand = c(0, 0), limits = c(0, 1000)) +
  theme_bw() +
  theme(
    legend.position="top",
    axis.text = element_text(colour = "black"),
    axis.line.x = element_line(color="black", size = 0.5),
    axis.line.y = element_line(color="black", size = 0.5),
    # Remove gridlines and borders
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    legend.title.align=0.5) +
  # Colours follow the order of the Repository levels: red, blue, green, purple
  scale_color_manual(values = c("#e41a1c","#377eb8", "#4daf4a", "#984ea3")) +
  guides(color = guide_legend(title.position = "top"))

3.5 Figure 3.12B. The frequency of publicly available studies and time until data reuse

# Figure 3.12B: percentage of reused studies by the number of years between
# release and reuse
# group = 1 joins all points into a single line although Years is a factor
ggplot(Time2Reuse, aes(Years, Percentage, group = 1))  + 
  geom_line() +
  geom_point() +
  ylab("Percentage of Reused Studies") +
  # Start the y axis exactly at 0, without padding
  scale_y_continuous(expand = c(0, 0), limits = c(0, 40)) +
  theme_bw() +
  theme(
    legend.position="top",
    axis.text = element_text(colour = "black"),
    axis.line.x = element_line(color="black", size = 0.5),
    axis.line.y = element_line(color="black", size = 0.5),
    # Remove gridlines and borders
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    legend.title.align=0.5)

3.6 Supplementary Figure 1. Types of Reuse

# Tabulate how often each type of reuse occurs and its share of all
# publications
TypeReuse <- setNames(as.data.frame(table(DataReuse$Classification)),
                      c("Classification", "Frequency"))
TypeReuse$Percentage <- prop.table(TypeReuse$Frequency) * 100
# Largest share first
TypeReuseOrdered <- TypeReuse[order(-TypeReuse$Percentage), ]

# One colour per slice of the pie
colors <- c('rgb(211,94,96)', 'rgb(128,133,133)', 'rgb(144,103,167)', 'rgb(171,104,87)', 'rgb(114,147,203)')

# Pie chart labelled inside each slice; hovering shows the number of studies
plot_ly(
  TypeReuseOrdered,
  labels = ~Classification,
  values = ~Percentage,
  type = 'pie',
  textposition = 'inside',
  textinfo = 'label+percent',
  insidetextfont = list(color = '#FFFFFF'),
  hoverinfo = 'text',
  text = ~paste(Frequency, "Studies"),
  marker = list(colors = colors, line = list(color = '#FFFFFF', width = 1)),
  showlegend = FALSE
) %>%
  layout(
    title = 'Study Classification',
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )

3.7 Supplementary Figure 2. The percentage of studies published per year that have been reused including all studies

# Supplementary Figure 2: percentage of the studies released in each year
# that have been reused, one line per repository
ggplot(PerAge, aes(Year, Percentage, color=Repository, group=Repository))  + 
  geom_line() +
  geom_point() +
  ylab("Reused Studies (%)") +
  # Start the y axis exactly at 0, without padding
  scale_y_continuous(expand = c(0, 0), limits = c(0, 100)) +
  # Dashed vertical markers at the first three positions of the discrete Year
  # axis; the annotate() calls below label them as the repository launches
  geom_segment(aes(x = 1, y = 0, xend = 1, yend = 49), linetype="dashed", color = "black", size=0.3) +
  geom_segment(aes(x = 2, y = 0, xend = 2, yend = 49), linetype="dashed", color = "black", size=0.15) +
  geom_segment(aes(x = 3, y = 0, xend = 3, yend = 22), linetype="dashed", color = "black", size=0.35) +
  theme_bw() +
  theme(
    legend.position="top",
    axis.text = element_text(colour = "black"),
    axis.line.x = element_line(color="black", size = 0.5),
    axis.line.y = element_line(color="black", size = 0.5),
    # Remove gridlines and borders
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    legend.title.align=0.5) +
  # Three colours, one per repository level (there is no "All" series here)
  scale_color_manual(values = c("#377eb8", "#4daf4a", "#984ea3")) +
  guides(color = guide_legend(title.position = "top")) +
  # Launch labels, offset to the right of their markers
  annotate("text", x = 1.5, y = 45, label = "MetaboLights \n launched") +
  annotate("text", x = 2.55, y = 42.5, label = "Metabolomics \n Workbench \n launched") +
  annotate("text", x = 3.35, y = 18, label = "GNPS \n launched")

3.8 Supplementary Figure 3. The percentage of studies published per year that have been reused excluding Spicer et al. (2017)

# Supplementary Figure 3: as Supplementary Figure 2, but built from the reuse
# data that leaves out publications whose authors include Spicer
ggplot(PerNSAge, aes(Year, Percentage, color=Repository, group=Repository))  + 
  geom_line() +
  geom_point() +
  ylab("Reused Studies (%)") +
  # Start the y axis exactly at 0, without padding
  scale_y_continuous(expand = c(0, 0), limits = c(0, 100)) +
  # Dashed vertical markers at the first three positions of the discrete Year
  # axis; the annotate() calls below label them as the repository launches
  geom_segment(aes(x = 1, y = 0, xend = 1, yend = 49), linetype="dashed", color = "black", size=0.3) +
  geom_segment(aes(x = 2, y = 0, xend = 2, yend = 6), linetype="dashed", color = "black", size=0.1) +
  geom_segment(aes(x = 3, y = 0, xend = 3, yend = 22), linetype="dashed", color = "black", size=0.35) +
  theme_bw() +
  theme(
    legend.position="top",
    axis.text = element_text(colour = "black"),
    axis.line.x = element_line(color="black", size = 0.5),
    axis.line.y = element_line(color="black", size = 0.5),
    # Remove gridlines and borders
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    legend.title.align=0.5) +
  # Three colours, one per repository level (there is no "All" series here)
  scale_color_manual(values = c("#377eb8", "#4daf4a", "#984ea3")) +
  guides(color = guide_legend(title.position = "top")) +
  # Launch labels placed above the dashed markers
  annotate("text", x = 1, y = 56, label = "MetaboLights \n launched") +
  annotate("text", x = 2, y = 16, label = "Metabolomics \n Workbench \n launched") +
  annotate("text", x = 3, y = 30, label = "GNPS \n launched")

3.9 Supplementary Table 1. The frequency of reuse per study

# Supplementary Table 1: number of reuses per study. UniqueStudiesCount()
# returns the frequency table already sorted by descending frequency.
ReuseStud <- UniqueStudiesCount(DataReuse)
colnames(ReuseStud) <- c("Study", "Frequency")
# Use FALSE rather than F: F is an ordinary, reassignable binding, so a stray
# `F <- ...` earlier in the session would silently change the table layout.
kable(ReuseStud, row.names = FALSE) %>%
  kable_styling(full_width = FALSE)
Study Frequency
MTBLS36 4
MTBLS93 4
MTBLS1 3
MTBLS124 3
MTBLS126 3
MTBLS146 3
MTBLS17 3
MTBLS2 3
MTBLS20 3
MTBLS214 3
MTBLS28 3
MTBLS87 3
MTBLS90 3
ST000075 3
ST000354 3
MSV000078568 2
MSV000078598 2
MSV000078839 2
MSV000078936 2
MSV000079450 2
MTBLS103 2
MTBLS125 2
MTBLS127 2
MTBLS140 2
MTBLS19 2
MTBLS213 2
MTBLS229 2
MTBLS24 2
MTBLS265 2
MTBLS266 2
MTBLS273 2
MTBLS289 2
MTBLS315 2
MTBLS32 2
MTBLS341 2
MTBLS364 2
MTBLS38 2
MTBLS40 2
MTBLS424 2
MTBLS74 2
MTBLS79 2
MTBLS88 2
ST000011 2
ST000077 2
ST000091 2
ST000163 2
ST000220 2
ST000284 2
ST000320 2
ST000321 2
ST000326 2
ST000340 2
ST000342 2
ST000382 2
ST000383 2
ST000387 2
ST000403 2
A05001 1
A06001 1
MEX1 1
MEX10 1
MEX101 1
MEX104 1
MEX106 1
MEX11 1
MEX115 1
MEX15 1
MEX16 1
MEX18 1
MEX2 1
MEX27 1
MEX28 1
MEX36 1
MEX42 1
MEX44 1
MEX45 1
MEX5 1
MEX53 1
MEX6 1
MEX66 1
MEX68 1
MEX69 1
MEX70 1
MEX71 1
MEX72 1
MEX74 1
MEX75 1
MEX76 1
MEX77 1
MEX82 1
MEX86 1
MEX90 1
MEX93 1
MEX96 1
MEX97 1
MEX98 1
MEX99 1
MSV000078552 1
MSV000078557 1
MSV000078567 1
MSV000078577 1
MSV000078584 1
MSV000078586 1
MSV000078589 1
MSV000078603 1
MSV000078604 1
MSV000078606 1
MSV000078607 1
MSV000078611 1
MSV000078612 1
MSV000078628 1
MSV000078635 1
MSV000078649 1
MSV000078658 1
MSV000078670 1
MSV000078683 1
MSV000078708 1
MSV000078710 1
MSV000078711 1
MSV000078719 1
MSV000078726 1
MSV000078744 1
MSV000078787 1
MSV000078803 1
MSV000078805 1
MSV000078811 1
MSV000078812 1
MSV000078816 1
MSV000078817 1
MSV000078832 1
MSV000078836 1
MSV000078892 1
MSV000078903 1
MSV000078922 1
MSV000078937 1
MSV000078960 1
MSV000078993 1
MSV000079029 1
MSV000079040 1
MSV000079050 1
MSV000079069 1
MSV000079091 1
MSV000079098 1
MSV000079104 1
MSV000079105 1
MSV000079146 1
MSV000079243 1
MSV000079329 1
MSV000079339 1
MSV000079341 1
MSV000079344 1
MSV000079356 1
MSV000079398 1
MSV000079416 1
MSV000079421 1
MSV000079447 1
MSV000079558 1
MSV000079573 1
MSV000079581 1
MSV000079598 1
MSV000079651 1
MSV000079652 1
MSV000079679 1
MSV000079758 1
MSV000079760 1
MSV000079772 1
MSV000079773 1
MSV000079777 1
MSV000079778 1
MSV000079787 1
MSV000079808 1
MSV000079813 1
MSV000079825 1
MSV000079838 1
MSV000079888 1
MSV000079905 1
MSV000079907 1
MTBLS100 1
MTBLS102 1
MTBLS104 1
MTBLS105 1
MTBLS11 1
MTBLS117 1
MTBLS12 1
MTBLS123 1
MTBLS129 1
MTBLS13 1
MTBLS137 1
MTBLS143 1
MTBLS144 1
MTBLS150 1
MTBLS152 1
MTBLS154 1
MTBLS155 1
MTBLS156 1
MTBLS157 1
MTBLS160 1
MTBLS161 1
MTBLS162 1
MTBLS163 1
MTBLS164 1
MTBLS166 1
MTBLS169 1
MTBLS172 1
MTBLS173 1
MTBLS174 1
MTBLS176 1
MTBLS177 1
MTBLS178 1
MTBLS188 1
MTBLS189 1
MTBLS191 1
MTBLS197 1
MTBLS198 1
MTBLS200 1
MTBLS216 1
MTBLS217 1
MTBLS218 1
MTBLS219 1
MTBLS22 1
MTBLS225 1
MTBLS226 1
MTBLS227 1
MTBLS228 1
MTBLS23 1
MTBLS233 1
MTBLS234 1
MTBLS237 1
MTBLS240 1
MTBLS242 1
MTBLS243 1
MTBLS247 1
MTBLS253 1
MTBLS26 1
MTBLS263 1
MTBLS264 1
MTBLS267 1
MTBLS27 1
MTBLS277 1
MTBLS279 1
MTBLS280 1
MTBLS282 1
MTBLS298 1
MTBLS30 1
MTBLS307 1
MTBLS31 1
MTBLS313 1
MTBLS320 1
MTBLS327 1
MTBLS33 1
MTBLS337 1
MTBLS338 1
MTBLS34 1
MTBLS345 1
MTBLS35 1
MTBLS354 1
MTBLS358 1
MTBLS368 1
MTBLS374 1
MTBLS376 1
MTBLS378 1
MTBLS385 1
MTBLS39 1
MTBLS394 1
MTBLS4 1
MTBLS404 1
MTBLS41 1
MTBLS414 1
MTBLS419 1
MTBLS42 1
MTBLS422 1
MTBLS427 1
MTBLS45 1
MTBLS46 1
MTBLS47 1
MTBLS52 1
MTBLS57 1
MTBLS59 1
MTBLS61 1
MTBLS67 1
MTBLS7 1
MTBLS71 1
MTBLS72 1
MTBLS81 1
MTBLS92 1
MTBLS95 1
MTBLS96 1
ST000001 1
ST000002 1
ST000004 1
ST000009 1
ST000010 1
ST000013 1
ST000016 1
ST000026 1
ST000027 1
ST000028 1
ST000029 1
ST000030 1
ST000031 1
ST000032 1
ST000033 1
ST000034 1
ST000035 1
ST000036 1
ST000037 1
ST000038 1
ST000039 1
ST000041 1
ST000042 1
ST000043 1
ST000044 1
ST000045 1
ST000046 1
ST000047 1
ST000056 1
ST000058 1
ST000061 1
ST000062 1
ST000063 1
ST000065 1
ST000069 1
ST000070 1
ST000071 1
ST000074 1
ST000076 1
ST000081 1
ST000082 1
ST000083 1
ST000087 1
ST000089 1
ST000090 1
ST000092 1
ST000093 1
ST000095 1
ST000096 1
ST000099 1
ST000104 1
ST000105 1
ST000106 1
ST000110 1
ST000111 1
ST000113 1
ST000114 1
ST000115 1
ST000121 1
ST000122 1
ST000133 1
ST000134 1
ST000135 1
ST000137 1
ST000138 1
ST000140 1
ST000142 1
ST000144 1
ST000145 1
ST000146 1
ST000147 1
ST000149 1
ST000150 1
ST000153 1
ST000154 1
ST000158 1
ST000159 1
ST000160 1
ST000161 1
ST000164 1
ST000166 1
ST000168 1
ST000169 1
ST000171 1
ST000172 1
ST000176 1
ST000182 1
ST000188 1
ST000189 1
ST000192 1
ST000193 1
ST000194 1
ST000195 1
ST000196 1
ST000198 1
ST000199 1
ST000201 1
ST000202 1
ST000203 1
ST000206 1
ST000207 1
ST000212 1
ST000213 1
ST000215 1
ST000218 1
ST000221 1
ST000222 1
ST000223 1
ST000224 1
ST000225 1
ST000226 1
ST000230 1
ST000231 1
ST000232 1
ST000233 1
ST000236 1
ST000242 1
ST000245 1
ST000246 1
ST000248 1
ST000249 1
ST000250 1
ST000254 1
ST000255 1
ST000257 1
ST000259 1
ST000261 1
ST000270 1
ST000272 1
ST000273 1
ST000274 1
ST000276 1
ST000278 1
ST000279 1
ST000282 1
ST000283 1
ST000285 1
ST000287 1
ST000288 1
ST000291 1
ST000292 1
ST000293 1
ST000295 1
ST000296 1
ST000298 1
ST000299 1
ST000301 1
ST000302 1
ST000303 1
ST000304 1
ST000306 1
ST000310 1
ST000311 1
ST000313 1
ST000314 1
ST000315 1
ST000316 1
ST000317 1
ST000318 1
ST000319 1
ST000322 1
ST000324 1
ST000325 1
ST000327 1
ST000329 1
ST000330 1
ST000331 1
ST000332 1
ST000336 1
ST000337 1
ST000338 1
ST000341 1
ST000344 1
ST000346 1
ST000352 1
ST000355 1
ST000356 1
ST000367 1
ST000368 1
ST000369 1
ST000370 1
ST000371 1
ST000374 1
ST000375 1
ST000376 1
ST000379 1
ST000380 1
ST000381 1
ST000385 1
ST000386 1
ST000388 1
ST000389 1
ST000390 1
ST000391 1
ST000392 1
ST000396 1
ST000397 1
ST000398 1
ST000404 1
ST000412 1
ST000413 1
ST000419 1
ST000421 1
ST000422 1
ST000425 1
ST000426 1
ST000427 1
ST000428 1
ST000432 1
ST000433 1
ST000434 1
ST000435 1
ST000438 1
ST000439 1
ST000440 1
ST000442 1
ST000443 1
ST000445 1
ST000450 1
ST000451 1
ST000452 1
ST000465 1
ST000477 1
ST000483 1
ST000502 1
ST000510 1
ST000539 1
ST000542 1
ST000543 1